Text analysis: title and abstract of male and female speakers

Abstracts

# Load the presentations table; `as.is = TRUE` keeps text columns as
# character vectors instead of converting them to factors.
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
data <- read.table("data/presentations_PPGE_2008-2019.csv", sep = ",",
                   header = TRUE, as.is = TRUE)
# Parse the date column with dmy() and derive the year of each record.
data$date <- dmy(data$date)
data$year <- year(data$date)
#skimr::skim(data)

Excluding special events, such as round tables and discussions, that are not related to a project or study presented by someone.

# Ids of the records to leave out of the analysis.
IDs <- c(154, 250, 211, 289, 230, 167, 212)
# Keep every row whose id is not in that list.
data <- filter(data, !(id %in% IDs))

Using abstracts in English (original or translated)

# Keep only the rows that have an English abstract.
data <- filter(data, !is.na(abstract_english))

Number of abstracts per group

table(data$gender)
## 
##   F   M 
##  99 136
table(data$position_cat,data$gender)
##            
##              F  M
##   others     4  1
##   postdoc   21 21
##   professor 21 58
##   student   52 56

Tidytext

# Build one text per record (title followed by abstract) and split it into
# one row per token, keeping the id/gender/position/audience columns.
text_tok <- data %>%
  mutate(text = paste(title_english, abstract_english)) %>%
  dplyr::select(id, gender, position_cat, audience_n,
                abstract_english, title_english, text) %>%
  unnest_tokens(output = word, input = text)

# English stop words as a one-column tibble so they can be anti-joined.
stop_w <- tibble(word = stopwords("en"))

# Remove stop words, then sort the tokens alphabetically.
text <- text_tok %>%
  anti_join(stop_w, by = "word") %>%
  arrange(word)

# Remove other non-words (numbers, characters) and leftover stop words.
# NOTE(review): the first 285 rows are dropped by position, which depends on
# the sort above and on the current data set; re-check if the input changes.
text <- text %>%
  slice(-c(1:285)) %>% # numbers and some symbols
  filter(nchar(word) != 1) %>% # letters alone
  # Fixed: the list ended in `"agb" < "al"`, a comparison that evaluates to a
  # logical, so neither "agb" nor "al" was actually removed.
  # NOTE(review): "ÎŽ13c" and "ÎČ" look like mis-encoded text; confirm they
  # match the tokens as they appear in the data.
  filter(!word %in% c("mpas", "ÎŽ13c", "ÎČ", "can", "aff", "agb", "al"))

# Solving some simple plurals.
# Words listed in `plural` lose their final "s".
plural <- c("actions", "advances", "adaptations", "amphibians", "animals",
            "ants", "anurans", "abundances", "adjustments", "adults",
            "affects", "applications", "bees", "builds", "birds", "palms",
            "cerrados", "challenges", "outputs", "queens", "techniques",
            "continents", "crops", "consequences", "questions",
            "decisions", "declines", "determines", "determinants",
            "defenses", "dynamics", "agroecosystems", "benefits", "biomes",
            "economics", "ecosystems", "environments", "experiences",
            "forests", "grasslands", "cases", "cells", "changes", "chances",
            "genetics", "gifts", "gradients", "guides", "impacts",
            "increases", "interactions", "lives",
            "landscapes", "males", "mammals", "mangroves", "models",
            "movements", "mutualisms", "networks", "neotropics",
            "opilions", "phenotypes", "plants", "projects", "paths",
            "perspectives", "allows", "areas", "assemblages", "assessments",
            "populations", "promotes", "relationships", "relations",
            "resources", "responses", "roads", "services", "skulls",
            "snakes", "seeds", "spaces", "spiders", "stages", "trees",
            "variations", "threats", "characteristics", "climates",
            "collaborations", "contexts")
# Words whose plural is formed with "es": dropping a single character would
# leave a non-word ("approaches" -> "approache"), so two are removed.
plural_es <- c("approaches")

is_es <- text$word %in% plural_es
text$word[is_es] <- substr(text$word[is_es], 1, nchar(text$word[is_es]) - 2)

is_s <- text$word %in% plural
text$word[is_s] <- substr(text$word[is_s], 1, nchar(text$word[is_s]) - 1)
  • Grouping similar words:
# Lemma table: each row is c(<form found in the text>, <form to use instead>).
lemma <- rbind(
  c("adaptive", "adaptation"),
  c("abilities", "ability"),
  c("advancement", "advance"),
  c("abundant", "abundance"),
  c("academies", "academic"),
  c("academic", "academic"),
  c("absent", "absence"),
  c("activities", "activity"),
  c("accomplished", "accomplish"),
  c("accounting", "account"),
  c("agricultural", "agriculture"),
  c("agro", "agriculture"),
  c("amazonia", "amazon"),
  c("amazonian", "amazon"),
  c("allowed", "allow"),
  c("allowing", "allow"),
  c("andean", "andes"),
  c("apply", "application"),
  c("analysed", "analysis"),
  c("analyzed", "analysis"),
  c("analyzing", "analysis"),
  c("analyses", "analysis"),
  c("analytic", "analysis"),
  c("analytical", "analysis"),
  c("applying", "application"),
  c("apidae", "apis"),
  c("arachnida", "arachnid"),
  c("argue", "argument"),
  c("basal", "basis"),
  c("behavioral", "behavior"),
  c("behavioural", "behavior"),
  c("bignonieae", "bignoniaceae"),
  c("biological", "biology"),
  c("brazilian", "brazil"),
  c("brazil's", "brazil"),
  c("brazil’s", "brazil"),
  c("building", "build"),
  c("changing", "change"),
  c("cnidarian", "cnidaria"),
  c("caused", "cause"),
  c("causes", "cause"),
  c("causing", "cause"),
  c("coastal", "coast"),
  c("changed", "change"),
  c("colour", "color"),
  c("colors", "color"),
  c("communities", "community"),
  c("competitive", "competition"),
  c("complexity", "complex"),
  c("convergences", "convergence"),
  c("convergent", "convergence"),
  c("cordatus", "cordata"),
  c("croplands", "crop"),
  c("cultural", "culture"),
  c("darwin's", "darwin"),
  c("darwinian", "darwin"),
  c("defensive", "defense"),
  c("dependent", "dependence"),
  c("detecting", "detection"),
  c("determine", "determinant"),
  c("developmental", "development"),
  c("dispersers", "dispersal"),
  c("disturbed", "disturbance"),
  c("diversification", "diversity"),
  c("dragonflies", "dragonfly"),
  c("drier", "drought"),
  c("ecological", "ecology"),
  c("ecologists", "ecology"),
  c("endemic", "endemism"),
  c("effectiveness", "efficiency"),
  c("environmental", "environment"),
  c("evolutionary", "evolution"),
  c("expanding", "expansion"),
  c("extinct", "extinction"),
  c("facilitate", "facilitation"),
  c("fisheries", "fishery"),
  c("floral", "flora"),
  c("floristic", "flora"),
  c("forested", "forest"),
  c("functional", "function"),
  c("functionally", "function"),
  c("functioning", "function"),
  c("frequencies", "frequency"),
  c("frequently", "frequency"),
  c("frequent", "frequency"),
  c("geographical", "geographic"),
  c("heterogeneties", "heterogeneity"),
  c("heterogeneous", "heterogeneity"),
  c("histories", "history"),
  c("integrated", "integration"),
  c("intregating", "integration"),
  c("integrative", "integration"),
  c("invasive", "invasion"),
  c("isotopic", "isotope"),
  c("linking", "link"),
  c("living", "live"),
  c("mammalia", "mammal"),
  c("managed", "manage"),
  c("managers", "manage"),
  c("mathematical", "mathematics"),
  c("mates", "mating"),
  c("mediated", "mediate"),
  c("mechanistic", "mechanism"),
  c("matrices", "matrix"),
  c("migratory", "migration"),
  c("mimicking", "mimicry"),
  c("modeling", "model"),
  c("mutualistic", "mutualism"),
  c("natural", "nature"),
  c("neotropical", "neotropic"),
  c("northeastern", "northeast"),
  c("occuring", "occur"),
  c("onça", "onca"),
  c("opiliones", "opilion"),
  c("parasite", "parasitism"),
  c("parent", "parenting"),
  c("phylogenies", "phylogeny"),
  c("phylogenetic", "phylogeny"),
  c("phylogenomic", "phylogeny"),
  c("pollinators", "pollination"),
  c("protected", "protect"),
  c("protective", "protect"),
  c("rainfall", "rain"),
  c("reconstructing", "reconstruction"),
  c("regulatory", "regulation"),
  c("regulates", "regulation"),
  c("relation", "relationship"),
  c("reproductive", "reproduction"),
  c("restored", "restoration"),
  c("robustness", "robust"),
  c("scientific", "science"),
  c("scientist", "science"),
  c("sexy", "sexual"),
  c("simulated", "simulation"),
  c("societies", "society"),
  c("social", "society"),
  c("socio", "society"),
  c("space", "spatial"),
  c("spacio", "spatial"),
  c("stabilize", "stability"),
  c("stable", "stability"),
  c("stories", "story"),
  c("strategic", "strategy"),
  c("strategies", "strategy"),
  c("structured", "structure"),
  c("structuring", "structure"),
  c("studies", "study"),
  c("studing", "study"),
  c("sustainable", "sustainability"),
  c("theories", "theory"),
  c("theoretical", "theory"),
  c("threatened", "threat"),
  c("tropical", "tropic"),
  c("vision", "visual")
)
# Keep both columns as character: with factor columns (the default before
# R 4.0) the assignment below would not copy the replacement text.
lemma <- as.data.frame(lemma, stringsAsFactors = FALSE)

# Apply the replacements row by row, in table order.
# seq_len() keeps the loop from running when the table is empty.
for (i in seq_len(nrow(lemma))) {
  text$word[text$word == lemma[i, 1]] <- lemma[i, 2]
}

WORDS - all data

table(text$gender)
## 
##     F     M 
## 10502 13215
table(text$position_cat ,text$gender)
##            
##                F    M
##   others     260  137
##   postdoc   2777 2481
##   professor 2055 5093
##   student   5319 5504

Mean number of words by title+abstract

# Number of tokens per record (title + abstract), compared between genders.
# Fixed: the chain was missing a `+` after ylab(), so the quasirandom point
# layer was evaluated as a separate statement and never added to the plot.
text %>%
  count(id, gender) %>%
  ggplot(aes(x = gender, y = n)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  ylab("Number of words in title + abtract") +
  ggbeeswarm::geom_quasirandom(size = 3, shape = 21)
## geom_point: na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_quasirandom

20 most common words

# Count each word over the whole corpus (most frequent first), keep the
# 20 highest counts (ties included) and print them as a table.
word_counts_all <- count(text, word, sort = TRUE)
kable(top_n(word_counts_all, 20, n))
word n
species 384
ecology 185
forest 174
model 157
study 157
environment 139
evolution 134
landscape 127
population 122
area 113
diversity 112
community 100
male 97
plant 97
nature 96
different 95
change 92
patterns 88
present 86
animal 82
interaction 82

Word cloud

# Word cloud built from every token in the corpus.
textplot_wordcloud(x = dfm(tokens(text$word)))

# Word clouds by gender, each with its own colour.
par(mfrow = c(1, 2))
textplot_wordcloud(x = dfm(tokens(text$word[text$gender == "F"])),
                   col = "#6D57CF")
# NOTE(review): par(new = TRUE) asks the next plot to draw without starting a
# new frame; confirm this is intended together with mfrow = c(1, 2).
# TRUE is spelled out because `T` can be reassigned.
par(new = TRUE)
textplot_wordcloud(x = dfm(tokens(text$word[text$gender == "M"])),
                   col = "#FCA532")

Word frequencies by gender

# Relative frequency of each word within each gender, reshaped to one row per
# word with the F and M proportions and counts side by side.
props <- text %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  # Drop the grouping before reshaping: `gender` disappears as a column below.
  ungroup() %>%
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(abs.dif.p = abs(proportion_F - proportion_M),
         rel.dif.p = pmax(proportion_F, proportion_M) /
           pmin(proportion_F, proportion_M)) %>%
  arrange(desc(abs.dif.p))
# Only the 20 words with the largest absolute difference get a text label.
props$label <- NA
props$label[1:20] <- props$word[1:20]
# Scatter of male vs. female proportions on log scales; the dashed line marks
# equal use. (A stray empty argument in aes() was removed.)
ggplot(props, aes(x = proportion_M, y = proportion_F,
                  color = abs.dif.p)) +
  geom_abline(color = "gray40", lty = 2) +
  #geom_point(size=2.5, alpha=0.5)+
  geom_jitter(size = 2.5, alpha = 0.2) +
  geom_text_repel(aes(label = label), size = 3.2) +
  scale_x_log10(name = "Male most used words",
                labels = percent_format()) +
  scale_y_log10(name = "Female most used words",
                labels = percent_format()) +
  scale_color_gradient(name = "Abs Diff", low = "blue", high = "red",
                       labels = percent_format()) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0))

 # geom_smooth(method="lm")
# ggsave() with no plot argument writes the last plot displayed.
ggsave("figures/abstract_wordFreq.jpg", height = 5, width = 7)

Words that are close to the dashed line have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.

Legend: absolute differences in the frequency of the word by males and females. Differences above 0.3% are also indicated in text.

Correlation of word-use frequency between genders:

cor.test(props$proportion_F, props$proportion_M)
## 
##  Pearson's product-moment correlation
## 
## data:  props$proportion_F and props$proportion_M
## t = 71.063, df = 1615, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8581171 0.8817840
## sample estimates:
##       cor 
## 0.8704527

Highly correlated: both genders tend to use the main words with similar frequencies.

20 words with the largest differences in frequency

# Diverging bar chart for the labelled words: female proportions are negated
# so they extend to the left, male proportions to the right.
prop2 <- props %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = n_F + n_M) %>%
  mutate(word = fct_reorder(word, (ntot), max),
         proportion_F = proportion_F * -1) %>%
  # Select the two proportion columns by name rather than by position (2:3),
  # so the reshape does not depend on the column order of `props`.
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")

ggplot(prop2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() +
  ylab("") +
  xlab("Proportion") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532"),
                    labels = c("F", "M")) +
  geom_vline(xintercept = c(-0.02, -0.01, 0, 0.01, 0.02),
             linetype = "dotted",
             col = "darkgray") +
  # Show the axis as absolute proportions on both sides of zero.
  scale_x_continuous(breaks = c(-0.02, -0.01, 0, 0.01, 0.02),
                     labels = c(0.02, 0.01, 0, 0.01, 0.02))

ggsave("figures/abstract_wordFreq_barplot.jpeg", units = "in", width = 7, height = 7, dpi = 300)

TF IDF

The statistic tf-idf is intended to measure how important a word is to a document in a collection (or corpus) of documents, for example, to one novel in a collection of novels or to one website in a collection of websites.

Calculating tf-idf attempts to find the words that are important (i.e., common) in a text, but not too common. Let’s do that now.

# tf-idf with each gender treated as one document, highest scores first.
text_id <- arrange(
  bind_tf_idf(count(text, gender, word), word, gender, n),
  desc(tf_idf)
)

10 “exclusive” words for each group

# Ten highest tf-idf words per gender, one facet per gender.
text_id$word <- as.factor(text_id$word)
text_id %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(10, tf_idf) %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  # Fixed: the labels were swapped. tf-idf is mapped to x and the words to y,
  # so "tf-idf" belongs on the x axis and the word axis needs no title.
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()

WORDS - professors only data

# Tokens from records whose position category is "professor".
textP <- filter(text, position_cat == "professor")

table(textP$gender)
## 
##    F    M 
## 2055 5093

Mean number of words by abstract

# Tokens per record for professors, by gender: violin, narrow boxplot and
# the individual records as quasirandom points.
ggplot(count(textP, id, gender), aes(x = gender, y = n)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  ggbeeswarm::geom_quasirandom(size = 3, shape = 21)

20 most common words

# Count each word in the professors' texts (most frequent first), keep the
# 20 highest counts (ties included) and print them as a table.
kable(top_n(count(textP, word, sort = TRUE), 20, n))
word n
species 90
ecology 68
environment 52
evolution 52
population 50
plant 43
study 42
model 41
nature 38
ecosystem 37
change 35
diversity 35
pollination 30
research 30
society 30
biology 29
interaction 29
science 29
present 26
landscape 25
may 25
results 25

Words Frequency by gender

# Relative frequency of each word within each gender (professors only),
# reshaped to one row per word with F and M values side by side.
propsP <- textP %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  # Drop the grouping before reshaping: `gender` disappears as a column below.
  ungroup() %>%
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(abs.dif.p = abs(proportion_F - proportion_M),
         rel.dif.p = pmax(proportion_F, proportion_M) /
           pmin(proportion_F, proportion_M)) %>%
  arrange(desc(abs.dif.p))
# Only the 20 words with the largest absolute difference get a text label.
propsP$label <- NA
propsP$label[1:20] <- propsP$word[1:20]
# Scatter of male vs. female proportions on log scales with fixed limits;
# the dashed line marks equal use.
ggplot(propsP, aes(x = proportion_M, y = proportion_F,
                   color = abs.dif.p)) +
  geom_abline(color = "gray40", lty = 2) +
  # geom_point(size=2.5, alpha=0.3) +
  geom_jitter(size = 2.5, alpha = 0.3) +
  geom_text_repel(aes(label = label), size = 3) +
  scale_x_log10(name = "Male most used words", limits = c(0.0003, 0.02),
                labels = percent_format()) +
  scale_y_log10(name = "Female Most used words", limits = c(0.0003, 0.02),
                labels = percent_format()) +
  scale_color_gradient(name = "Abs Diff", low = "blue", high = "red",
                       labels = percent_format()) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0))

 # geom_smooth(method="lm")
# ggsave() with no plot argument writes the last plot displayed.
ggsave("figures/abstract_wordFreq_Prof.jpg", height = 5, width = 7)

Words that are close to the dashed line in these plots have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.

Legend: absolute differences in the frequency of the word by males and females.

Labels for the 20 words with largest differences in frequency.

Correlation of word-use frequency between genders:

cor.test(propsP$proportion_F, propsP$proportion_M)
## 
##  Pearson's product-moment correlation
## 
## data:  propsP$proportion_F and propsP$proportion_M
## t = 20.749, df = 548, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6137225 0.7076573
## sample estimates:
##       cor 
## 0.6632945

20 words with the largest differences in frequency

# Diverging bar chart for the labelled words (professors): female proportions
# are negated so they extend to the left, male proportions to the right.
propP2 <- propsP %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = n_F + n_M) %>%
  mutate(word = fct_reorder(word, (ntot), max),
         proportion_F = proportion_F * -1) %>%
  # Select the two proportion columns by name rather than by position (2:3),
  # so the reshape does not depend on the column order of `propsP`.
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")

ggplot(propP2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() +
  ylab("") +
  xlab("Proportion") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532"),
                    labels = c("F", "M")) +
  geom_vline(xintercept = c(-0.02, -0.01, 0, 0.01, 0.02),
             linetype = "dotted",
             col = "darkgray") +
  # Show the axis as absolute proportions on both sides of zero.
  scale_x_continuous(breaks = c(-0.02, -0.01, 0, 0.01, 0.02),
                     labels = c(0.02, 0.01, 0, 0.01, 0.02))

ggsave("figures/abstract_wordFreq_barplot_Prof.jpeg", units = "in", width = 7, height = 7, dpi = 300)

TF IDF

# tf-idf for professors, each gender treated as one document,
# highest scores first.
text_idP <- arrange(
  bind_tf_idf(count(textP, gender, word), word, gender, n),
  desc(tf_idf)
)

10 “exclusive” words for each group

# Ten highest tf-idf words per gender (professors), one facet per gender.
text_idP$word <- as.factor(text_idP$word)
text_idP %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(10, tf_idf) %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  # Fixed: the labels were swapped. tf-idf is mapped to x and the words to y,
  # so "tf-idf" belongs on the x axis and the word axis needs no title.
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()

Topic model - all data

LDA - latent Dirichlet allocation, a method for fitting topic models

It treats each document as a mixture of topics, and each topic as a mixture of words. This allows documents to “overlap” each other in terms of content, rather than being separated into discrete groups, in a way that mirrors typical use of natural language.

Every document is a mixture of topics

Every topic is a mixture of words

# Document-term matrix with one document per record. The document name is
# "<id>_<gender>", so the gender can be read back from its last character.
matext <- text %>%
  count(id, gender, word) %>%
  mutate(id = paste(id, gender, sep = "_")) %>%
  # Namespace-qualified like the earlier dplyr::select() call in this file,
  # so the call does not depend on which attached package defines select().
  dplyr::select(-gender) %>%
  cast_dtm(term = word, document = id, value = n)

Choosing number of topics: comparing AIC

# Fit LDA models with different numbers of topics (same seed for each) and
# compare them by AIC.
ap_lda2 <- LDA(matext, k = 2, control = list(seed = 1234))
ap_lda3 <- LDA(matext, k = 3, control = list(seed = 1234))
ap_lda4 <- LDA(matext, k = 4, control = list(seed = 1234))
ap_lda5 <- LDA(matext, k = 5, control = list(seed = 1234))
ap_lda10 <- LDA(matext, k = 10, control = list(seed = 1234))
ap_lda20 <- LDA(matext, k = 20, control = list(seed = 1234))
# base = TRUE adds the AIC values themselves to the table (TRUE is spelled
# out because `T` can be reassigned).
bbmle::AICtab(ap_lda2, ap_lda3, ap_lda4, ap_lda5, ap_lda10, ap_lda20,
              base = TRUE)
##          AIC      dAIC     df   
## ap_lda2  364708.5      0.0 9539 
## ap_lda3  367793.6   3085.1 14308
## ap_lda4  370805.7   6097.2 19077
## ap_lda5  375098.1  10389.6 23846
## ap_lda10 406169.0  41460.5 47691
## ap_lda20 481100.3 116391.8 95381

two-topics model seems the most plausible model

Word-topic probabilities

10 words with the largest probabilities for each group

# Per-topic word probabilities (beta) from the two-topic model.
ap_topics <- tidy(ap_lda2, matrix = "beta")

# Ten highest-beta terms of each topic, ordered by topic and decreasing beta.
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# Horizontal bars, one facet per topic.
ap_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

Words with the greatest difference in beta between topics

# One row per term with its beta in each topic; keep terms whose beta exceeds
# 0.001 in at least one topic and compute log2(topic2 / topic1).
beta_spread <- ap_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  # pivot_wider() replaces the superseded spread().
  pivot_wider(names_from = topic, values_from = beta) %>%
  # spread() returned rows sorted by term; keep that ordering explicitly.
  arrange(term) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))
beta_spread
## # A tibble: 256 × 4
##    term           topic1   topic2 log_ratio
##    <chr>           <dbl>    <dbl>     <dbl>
##  1 abundance   0.00179   0.000769    -1.22 
##  2 action      0.00180   0.000135    -3.73 
##  3 activity    0.000755  0.00148      0.970
##  4 adaptation  0.000327  0.00258      2.98 
##  5 addition    0.00121   0.000884    -0.447
##  6 affect      0.00107   0.00121      0.174
##  7 agriculture 0.00154   0.000775    -0.992
##  8 allow       0.00108   0.00148      0.458
##  9 along       0.0000916 0.00106      3.53 
## 10 also        0.00315   0.00255     -0.310
## # … with 246 more rows
# Plot the five smallest and five largest log ratios.
# Fixed: the rows were picked with hard-coded positions (1:5 and 260:264).
# Positions beyond the last row are silently ignored by slice(), so the
# upper end was taken from the wrong rows or dropped altogether. The
# positions are now derived from the actual number of rows; unique() avoids
# duplicates when the table has fewer than ten rows.
beta_spread %>%
  arrange(log_ratio) %>%
  slice(unique(c(head(seq_len(n()), 5), tail(seq_len(n()), 5)))) %>%
  ggplot(aes(fct_reorder(term, log_ratio, min), log_ratio)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("Log2 ration of beta in topic 2/topic 1") +
  xlab("Word")

Document-topic probabilities - classifying the abstracts

and comparing the two groups by gender (if there is a difference in frequency)

# Per-document topic probabilities (gamma) from the two-topic model.
ap_documents <- tidy(ap_lda2, matrix = "gamma")
# Assign each document to its highest-gamma topic. The gender is the last
# character of the document name.
classifi <- ap_documents %>%
  mutate(gender = substr(document, nchar(document), nchar(document))) %>%
  group_by(document, gender) %>%
  # with_ties = FALSE guarantees exactly one row per document; top_n() would
  # keep both topics on a tie and count that document twice downstream.
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  # Return an ungrouped table so later steps are not affected by the grouping.
  ungroup()

table(classifi$gender, classifi$topic)
##    
##      1  2
##   F 54 45
##   M 65 71
# Gender-by-topic table with row percentages and counts in parentheses.
# The janitor functions are namespace-qualified because library(janitor) only
# appears further down in this file.
classifi %>%
  janitor::tabyl(gender, topic) %>%
  janitor::adorn_percentages() %>%
  janitor::adorn_pct_formatting(digits = 0) %>%
  janitor::adorn_ns() %>%
  kable()
gender 1 2
F 55% (54) 45% (45)
M 48% (65) 52% (71)
# Distribution of gamma for the assigned topic, one facet per gender.
ggplot(classifi, aes(x = as.character(topic), y = gamma)) +
  geom_boxplot() +
  facet_wrap(~ gender)

Chi-square test

chisq.test(classifi$gender, classifi$topic)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  classifi$gender and classifi$topic
## X-squared = 0.79212, df = 1, p-value = 0.3735

Topic model - Professors only

# Document-term matrix for professors, one document per record named
# "<id>_<gender>".
matextP <- textP %>%
  count(id, gender, word) %>%
  mutate(id = paste(id, gender, sep = "_")) %>%
  # Namespace-qualified like the earlier dplyr::select() call in this file.
  dplyr::select(-gender) %>%
  cast_dtm(term = word, document = id, value = n)
# Fit LDA models with 2 to 4 topics (same seed) and compare them by AIC.
ap_lda2P <- LDA(matextP, k = 2, control = list(seed = 1234))
ap_lda3P <- LDA(matextP, k = 3, control = list(seed = 1234))
ap_lda4P <- LDA(matextP, k = 4, control = list(seed = 1234))
# TRUE is spelled out because `T` can be reassigned.
bbmle::AICtab(ap_lda2P, ap_lda3P, ap_lda4P, base = TRUE)
##          AIC      dAIC     df  
## ap_lda2P 107963.8      0.0 4879
## ap_lda3P 109691.5   1727.7 7318
## ap_lda4P 112088.4   4124.6 9757

word-topic probabilities

# Per-topic word probabilities (beta) from the professors' two-topic model.
ap_topicsP <- tidy(ap_lda2P, matrix = "beta")

# Ten highest-beta terms of each topic, ordered by topic and decreasing beta.
ap_top_termsP <- ap_topicsP %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# Horizontal bars, one facet per topic.
ap_top_termsP %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

Words with the greatest difference in beta between topics

# One row per term with its beta in each topic (professors); keep terms whose
# beta exceeds 0.001 in at least one topic and compute log2(topic2 / topic1).
# Note: this reuses (overwrites) the name `beta_spread`.
beta_spread <- ap_topicsP %>%
  mutate(topic = paste0("topic", topic)) %>%
  # pivot_wider() replaces the superseded spread().
  pivot_wider(names_from = topic, values_from = beta) %>%
  # spread() returned rows sorted by term; keep that ordering explicitly.
  arrange(term) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))
beta_spread
## # A tibble: 371 × 4
##    term         topic1   topic2 log_ratio
##    <chr>         <dbl>    <dbl>     <dbl>
##  1 ability    0.000941 1.02e- 3     0.114
##  2 abundance  0.000844 2.24e- 3     1.41 
##  3 academic   0.00139  1.14e-43  -133.   
##  4 accepted   0.00139  1.65e-44  -136.   
##  5 account    0.00111  5.62e- 4    -0.985
##  6 across     0.000278 1.97e- 3     2.82 
##  7 action     0.00209  4.20e- 4    -2.32 
##  8 activity   0.00275  3.16e- 4    -3.12 
##  9 adaptation 0.00111  4.22e- 3     1.92 
## 10 advance    0.000278 1.69e- 3     2.60 
## # … with 361 more rows
# Plot the five smallest and five largest log ratios.
# Fixed: the rows were picked with hard-coded positions (1:5 and 260:264),
# which are not the last five rows unless the table has exactly 264 rows.
# The positions are now derived from the actual number of rows; unique()
# avoids duplicates when the table has fewer than ten rows.
beta_spread %>%
  arrange(log_ratio) %>%
  slice(unique(c(head(seq_len(n()), 5), tail(seq_len(n()), 5)))) %>%
  ggplot(aes(fct_reorder(term, log_ratio, min), log_ratio)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("Log2 ration of beta in topic 2/topic 1") +
  xlab("Word")

Document-topic probabilities

# Per-document topic probabilities (gamma) from the professors' model.
ap_documentsP <- tidy(ap_lda2P, matrix = "gamma")
# Assign each document to its highest-gamma topic. The gender is the last
# character of the document name.
classifiP <- ap_documentsP %>%
  mutate(gender = substr(document, nchar(document), nchar(document))) %>%
  group_by(document, gender) %>%
  # with_ties = FALSE guarantees exactly one row per document; top_n() would
  # keep both topics on a tie and count that document twice downstream.
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  # Return an ungrouped table so later steps are not affected by the grouping.
  ungroup()

table(classifiP$gender, classifiP$topic)
##    
##      1  2
##   F 14  7
##   M 30 28
library(janitor)
# Gender-by-topic table for professors: row percentages, whole-number
# formatting, counts in parentheses.
classifiP %>%
  tabyl(gender, topic) %>%
  adorn_percentages() %>%
  adorn_pct_formatting(digits = 0) %>%
  adorn_ns() %>%
  kable()
gender 1 2
F 67% (14) 33% (7)
M 52% (30) 48% (28)
# Distribution of gamma for the assigned topic, one facet per gender.
# Fixed: the violin was drawn on top of a full-width boxplot and covered it.
# The violin now goes first with a narrow boxplot over it, as in the other
# violin/boxplot figures of this file.
classifiP %>%
  ggplot(aes(as.character(topic), gamma)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  facet_wrap(~ gender)

Chi-square test

chisq.test(classifiP$gender, classifiP$topic)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  classifiP$gender and classifiP$topic
## X-squared = 0.85524, df = 1, p-value = 0.3551

Sentiment analysis

Chapter 2, Silge & Robinson, 2018

  • The NRC lexicon categorizes words in a binary fashion (“yes”/“no”) into categories of positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.
get_sentiments("nrc")
## # A tibble: 13,875 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # … with 13,865 more rows
  • The Bing lexicon categorizes words in a binary fashion into positive and negative categories.
get_sentiments("bing")
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # … with 6,776 more rows
  • The AFINN lexicon assigns words with a score that runs between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment.
get_sentiments("afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # … with 2,467 more rows

PENSAR: tem que levar em conta o número de palavras diferentes entre abstracts - principalmente se houver diferença média de número de palavras por abstract de homens e mulheres, né? Ou não?

Score words difference in female and male abstracts

All data

# AFINN lexicon: one numeric `value` per word.
affword <- get_sentiments("afinn")

# Per-record word counts, restricted to words present in the lexicon.
affc <- inner_join(
  count(text, id, gender, word, sort = TRUE),
  affword,
  "word"
)

Calculating the mean of the scores for each abstract (weighted by the number of times the word appears) by gender:

# Per-record sentiment summaries:
#   sum        - total score, each word weighted by its count
#   mean.score - unweighted mean over the distinct scored words
#   weig.score - mean weighted by word count
affc2 <- affc %>%
  group_by(id, gender) %>%
  summarise(sum = sum(value * n),
            mean.score = mean(value),
            weig.score = weighted.mean(value, n),
            # Return an ungrouped table instead of leaving it grouped by id.
            .groups = "drop")
# geom_quasirandom() is namespace-qualified, as in the earlier plots.
ggplot(affc2, aes(x = gender, y = weig.score)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  ggbeeswarm::geom_quasirandom() +
  ggtitle("Mean words score per abstract and gender")

ggplot(affc2, aes(x = gender, y = sum)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  ggbeeswarm::geom_quasirandom() +
  ggtitle("SUM words score per abstract and gender")

Professors

# AFINN lexicon: one numeric `value` per word.
affword <- get_sentiments("afinn")

# Per-record word counts for professors, restricted to lexicon words.
affcP <- inner_join(
  count(textP, id, gender, word, sort = TRUE),
  affword,
  "word"
)

Calculating the mean of the scores for each abstract (weighted by the number of times the word appears) by gender:

# Per-record sentiment summaries for professors:
#   sum        - total score, each word weighted by its count
#   mean.score - unweighted mean over the distinct scored words
#   weig.score - mean weighted by word count
affc2P <- affcP %>%
  group_by(id, gender) %>%
  summarise(sum = sum(value * n),
            mean.score = mean(value),
            weig.score = weighted.mean(value, n),
            # Return an ungrouped table instead of leaving it grouped by id.
            .groups = "drop")
# geom_quasirandom() is namespace-qualified, as in the earlier plots.
ggplot(affc2P, aes(x = gender, y = weig.score)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  ggbeeswarm::geom_quasirandom() +
  ggtitle("Mean words score per abstract and gender")

ggplot(affc2P, aes(x = gender, y = sum)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  ggbeeswarm::geom_quasirandom() +
  ggtitle("SUM words score per abstract and gender")

Frequency of sentiment words per abstract

As classificações das palavras não me parecem muito acuradas com a linguagem científica.

Precisa saber como ponderar pelo total de palavras.

All data

# NRC lexicon: word / sentiment pairs.
nrcword <- get_sentiments("nrc")

# Number of word occurrences per record, gender and sentiment category.
nrc <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(nrcword, "word") %>%
  group_by(id, gender, sentiment) %>%
  # .groups = "drop" returns an ungrouped table instead of leaving it grouped
  # by id and gender.
  summarise(n = sum(n), .groups = "drop")

# geom_quasirandom() is namespace-qualified, as in the earlier plots.
ggplot(nrc, aes(x = gender, y = n)) +
  facet_wrap(~sentiment) +
  geom_violin() +
  ggbeeswarm::geom_quasirandom()

# Five most frequent NRC "positive" and "negative" words per gender
# (ties included), as horizontal bars in a sentiment-by-gender grid.
text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(nrcword, "word") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  # Total occurrences of each word within gender and sentiment.
  count(gender, sentiment, word, wt = n, name = "n") %>%
  group_by(gender, sentiment) %>%
  top_n(5, n) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_grid(sentiment ~ gender, scales = "free") +
  coord_flip()

Professors

# NRC lexicon: word / sentiment pairs.
nrcword <- get_sentiments("nrc")

# Number of word occurrences per record, gender and sentiment category
# (professors). Note: this reuses (overwrites) the name `nrc`.
nrc <- textP %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(nrcword, "word") %>%
  group_by(id, gender, sentiment) %>%
  # .groups = "drop" returns an ungrouped table instead of leaving it grouped
  # by id and gender.
  summarise(n = sum(n), .groups = "drop")

# geom_quasirandom() is namespace-qualified, as in the earlier plots.
ggplot(nrc, aes(x = gender, y = n)) +
  facet_wrap(~sentiment) +
  geom_violin() +
  ggbeeswarm::geom_quasirandom()

# Per-record count of NRC "positive" words, compared between genders.
ggplot(filter(nrc, sentiment == "positive"), aes(x = gender, y = n)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  geom_quasirandom() +
  ggtitle("Positive words")

# Five most frequent NRC "positive" and "negative" words per gender among
# professors (ties included), in a sentiment-by-gender grid.
textP %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(nrcword, "word") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  # Total occurrences of each word within gender and sentiment.
  count(gender, sentiment, word, wt = n, name = "n") %>%
  group_by(gender, sentiment) %>%
  top_n(5, n) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_grid(sentiment ~ gender, scales = "free") +
  coord_flip()

Frequency of sentiment words per abstract

All data

# Bing lexicon: each word is tagged positive or negative.
bingword <- get_sentiments("bing")

# Number of word occurrences per record, gender and sentiment.
bing <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(bingword, "word") %>%
  group_by(id, gender, sentiment) %>%
  # .groups = "drop" returns an ungrouped table instead of leaving it grouped
  # by id and gender.
  summarise(n = sum(n), .groups = "drop")

# geom_quasirandom() is namespace-qualified, as in the earlier plots.
ggplot(bing, aes(x = sentiment, y = n)) +
  facet_wrap(~gender) +
  geom_violin() +
  ggbeeswarm::geom_quasirandom()

most common positive and negative words by gender

# Five most frequent Bing positive and negative words per gender
# (ties included), as horizontal bars in a sentiment-by-gender grid.
text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(bingword, "word") %>%
  # Total occurrences of each word within gender and sentiment.
  count(gender, sentiment, word, wt = n, name = "n") %>%
  group_by(gender, sentiment) %>%
  top_n(5, n) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_grid(sentiment ~ gender, scales = "free") +
  coord_flip()

Professors

# Bing lexicon: each word is tagged positive or negative.
bingword <- get_sentiments("bing")

# Number of word occurrences per record, gender and sentiment (professors).
# Note: this reuses (overwrites) the name `bing`.
bing <- textP %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(bingword, "word") %>%
  group_by(id, gender, sentiment) %>%
  # .groups = "drop" returns an ungrouped table instead of leaving it grouped
  # by id and gender.
  summarise(n = sum(n), .groups = "drop")

# geom_quasirandom() is namespace-qualified, as in the earlier plots.
ggplot(bing, aes(x = gender, y = n)) +
  facet_wrap(~sentiment) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  ggbeeswarm::geom_quasirandom()

# Five most frequent Bing positive and negative words per gender among
# professors (ties included), in a sentiment-by-gender grid.
textP %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(bingword, "word") %>%
  # Total occurrences of each word within gender and sentiment.
  count(gender, sentiment, word, wt = n, name = "n") %>%
  group_by(gender, sentiment) %>%
  top_n(5, n) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_grid(sentiment ~ gender, scales = "free") +
  coord_flip()